########################################
#### figure_4.R: Generates Figure 4 ####
########################################

data_rf_full = dta_overtime |> 
  mutate(
    accuracy_tri = case_when(
      accuracy %in% 1:2 ~ 1,
      accuracy == 3 ~ 0,
      accuracy %in% 4:5 ~ -1,
      .default = NA_integer_
    ),
    accuracy_dgr = case_when(
      accuracy %in% 1:3 ~ 0, # Flipped
      accuracy %in% 4:5 ~ 1,
      .default = NA_integer_
    ),
    accuracy_agr = case_when(
      accuracy %in% 1:2 ~ 1,
      accuracy %in% 3:5 ~ 0,
      .default = NA_integer_
    ),
    is_denier = as.factor(accuracy_dgr),
    period = case_when(
      date >= as.Date("2022-11-08") & date <= as.Date("2022-11-29") ~ "Post-Election",
      date >= as.Date("2023-01-12") & date <= as.Date("2023-02-03") ~ "Height",
      date >= as.Date("2024-01-24") & date <= as.Date("2024-02-15") ~ "Current",
      .default = "Other"
    ),
    maga = case_when(
      pid == "Democrat" ~ 0,
      maga == 1 ~ 1,
      .default = 0
    ),
    pid = as.factor(pid),
    age = 2022 - birthyr,
    strong = pid7 %in% c(1,7),
    male = ifelse(gender == 1, 1, 0),
    white = ifelse(race == 1, 1, 0),
    college = ifelse(educ %in% 5:6, 1, 0),
    married = ifelse(marstat == 1, 1, 0),
    children = ifelse(child18 == 1, 1, 0),
    employed = case_when(
      employ == 1 ~ 2,
      employ == 2 ~ 1,
      .default = 0 
    ),
    bornagain = ifelse(pew_bornagain == 1, 1, 0),
    voted_2020 = ifelse(presvote20post == 6, 0, 1),
    registered = ifelse(votereg == 1, 1, 0),
    inc_terc = case_when(
      faminc_new %in% 1:5 ~ "Lower",
      faminc_new %in% 6:9 ~ "Middle",
      faminc_new %in% 10:16 ~ "Upper",
      .default = "Middle" # Median imputation
    ),
    # Attitudes
    trust_binary = ifelse(general_trust == 1, 1, 0),
    vote_import_binary = ifelse(vote_importance %in% 1:2, 1, 0),
    corruption_binary = ifelse(institutional_corruption %in% 4:5, 1, 0),
    response_binary = ifelse(institutional_response %in% 4:5, 1, 0),
    pride_binary = ifelse(pride %in% 1:2, 1, 0),
    fair_binary = ifelse(fair_treatment %in% 1:2, 1, 0),
    democracy_binary = ifelse(democracy_importance %in% 1:2, 1, 0)
  ) |> 
  select(
    # DV
    is_denier, period,
    # Demos
    pid, maga, strong, age, male, white, college, married,
    children, employed, bornagain, voted_2020, registered, inc_terc,
    # Attitudes
    democrat_therm_1, republican_therm_1, ends_with("_binary"), weight
  ) |> 
  drop_na()

rf_clean = \(data, pd, demo_only = F){
  # Subset and Split
  if(demo_only){
    data_rf = data |> 
      filter(period == pd) |> 
      select(is_denier, weight, pid, inc_terc, employed, male, children,
             college, registered, bornagain, voted_2020, married, white, age)
  } else {
    data_rf = data |> 
      filter(period == pd) |> 
      select(-period)
  }
  ## Random Forest ##
  set.seed(120)
  data_split = initial_split(data_rf, prop = .8)
  data_train = training(data_split)
  data_test = testing(data_split)
  data_folds = vfold_cv(data_train)
  # Define Recipe
  rec_rf = recipe(is_denier ~ ., data = data_train) |> 
    step_rm(weight) |> 
    step_dummy(pid) |> 
    step_dummy(inc_terc)
  # Define Model
  rf_model = rand_forest(mtry = 9,
                         min_n = 32,
                         trees = 1000) |> 
    set_engine("ranger",
               num.threads = parallel::detectCores(),
               importance = "impurity_corrected",
               seed = 613) |> 
    set_mode("classification")
  # Set workflow(s)
  rf_wflow = workflow() |> 
    add_model(rf_model) |> 
    add_recipe(rec_rf)
  # Fit Models
  rf_fitted = rf_wflow |> 
    last_fit(data_split)
  return(rf_fitted)
}

rf_train = \(data, pd, demo_only = F){
  # Subset and Split
  if(demo_only){
    data_rf = data |> 
      filter(period == pd) |> 
      select(is_denier, weight, pid, inc_terc, employed, male, children,
             college, registered, bornagain, voted_2020, married, white, age)
  } else {
    data_rf = data |> 
      filter(period == pd) |> 
      select(-period)
  }
  ## Random Forest ##
  set.seed(120)
  data_split = initial_split(data_rf, prop = .8)
  data_train = training(data_split)
  return(data_train)
}

rf_res_vip = map_dfr(c("Post-Election","Height","Current"), \(d){
  rf_fitted = rf_clean(data_rf_full, d)
  vip_out = rf_fitted |> 
    extract_fit_parsnip() |> 
    vip::vi() |> 
    mutate(period = d)
  return(vip_out)
})

rf_res_vip_demo = map_dfr(c("Post-Election","Height","Current"), \(d){
  rf_fitted = rf_clean(data_rf_full, d, T)
  vip_out = rf_fitted |> 
    extract_fit_parsnip() |> 
    vip::vi() |> 
    mutate(period = d)
  return(vip_out)
})

rf_res_pred = map_dfr(c("Post-Election","Height","Current"), \(d){
  rf_fitted = rf_clean(data_rf_full, d)
  data_train = rf_train(data_rf_full, d)
  pred_out = rf_fitted |> 
    collect_metrics() |> 
    mutate(period = d) |> 
    add_row(.metric = "base_accuracy",
            period = d,
            .estimate = prop.table(table(data_train$is_denier))[1])
  return(pred_out)
})

rf_res_pred_demo = map_dfr(c("Post-Election","Height","Current"), \(d){
  rf_fitted = rf_clean(data_rf_full, d, T)
  data_train = rf_train(data_rf_full, d, T)
  pred_out = rf_fitted |> 
    collect_metrics() |> 
    mutate(period = d) |> 
    add_row(.metric = "base_accuracy",
            period = d,
            .estimate = prop.table(table(data_train$is_denier))[1])
  return(pred_out)
})

# Appendix Figure S6
fig_s6 = rf_res_pred |> 
  mutate(.metric = case_when(
    .metric == "accuracy" ~ "RF Accuracy",
    .metric == "roc_auc" ~ "RF ROC-AUC",
    .metric == "base_accuracy" ~ "Base Accuracy"
  )) |> 
  ggplot(aes(x = .estimate, y = reorder(.metric,.estimate), fill = .metric)) +
  geom_bar(stat = 'identity', show.legend = F) +
  geom_label(aes(label = paste0(round(.estimate*100,1),"%")), fill = "white") +
  facet_wrap(~ period, ncol = 3) +
  scale_x_continuous(labels = scales::percent, limits = c(0,1)) +
  scale_fill_manual(values =  c("#440154FF", "#2A788EFF", "#7AD151FF")) +
  labs(x = "Estimate", y = NULL, caption = "Base accuracy is the accuracy resulting from always predicting the majority class") +
  theme_prl()

ggsave(here("Figures", "figure_s6.pdf"), fig_s6,
       dpi = 600, units = "in", height = 4, width = 8)

# OLS Results

data_full_ols = data_rf_full |> 
  mutate(across(all_of(c("age","democrat_therm_1","republican_therm_1")), ~scale(.x)))

ols_res = map_dfr(c("Post-Election","Height","Current"), \(d){
  # Subset and Split
  data_rf = data_full_ols |> 
    filter(period == d) |> 
    select(-period)
  # Select vars
  vars = names(data_rf)[-which(names(data_rf) %in% c("is_denier", "weight"))]
  bv_res = map_dfr(vars, \(x){
    f = as.formula(paste0("is_denier ~ ", x))
    print(x)
    mod = svyglm(f, data_rf |>
                   as_survey_design(weights = weight) |> 
                   mutate(is_denier = as.numeric(is_denier) - 1)) 
    broom::tidy(mod) |> 
      filter(term != "(Intercept)") |> 
      mutate(iv = x)
  }) |> 
    mutate(iv = case_when(
      term == "pidRepublican" ~ "pid_Republican",
      term == "pidIndependent" ~ "pid_Independent",
      term == "inc_tercMiddle" ~ "inc_terc_Middle",
      term == "inc_tercUpper" ~ "inc_terc_Upper",
      .default = iv
    ),
    period = d)
  return(bv_res)
})

res = ols_res |> 
  left_join(rf_res_vip, join_by(iv == Variable, period == period)) |> 
  mutate(abs_imp = abs(Importance),
         imp_scaled = (abs_imp - min(abs_imp)) / (max(abs_imp) - min(abs_imp)))

res_demo = ols_res |> 
  left_join(rf_res_vip_demo, join_by(iv == Variable, period == period)) 

fig_4 = res |> 
  mutate(
    term_clean = case_match(
      iv,
      "employed" ~ "Employed",
      "male" ~ "Male",
      "children" ~ "Parent",
      "college" ~ "College-Educated",
      "registered" ~ "Registered Voter",
      "bornagain" ~ "Evangelical",
      "voted_2020" ~ "2020 Voter",
      "married" ~ "Married",
      "white" ~ "White",
      "pid_Republican" ~ "PID: Republican",
      "pid_Independent" ~ "PID: Independent",
      "inc_terc_Middle" ~ "Income (Middle Terc.)",
      "inc_terc_Upper" ~ "Income (Upper Terc.)",
      "strong" ~ "Strong Partisan",
      "age" ~ "Age",
      "trust_binary" ~ "General Trust",
      "vote_import_binary" ~ "Vote Efficacy",
      "democrat_therm_1" ~ "Democrat FT",
      "republican_therm_1" ~ "Republican FT",
      "maga" ~ "MAGA (self-identified)",
      "corruption_binary" ~ "Perceived Corruption",
      "fair_binary" ~ "Fair Political Treatment",
      "pride_binary" ~ "Proud to be American",
      "democracy_binary" ~ "Democracy Importance",
      "response_binary" ~ "Perceived Responsiveness"
    ),
    term_cat = case_when(
      iv %in% c("pid_Republican","pid_Independent","employed","male","children",
                "college","registered","bornagain","voted_2020","married","white",
                "inc_terc_Middle", "inc_terc_Upper","age") ~ "Demographic",
      .default = "Attitudinal"
    )) |>
  ggplot(aes(x = estimate, y = reorder(term_clean,estimate), color = period)) +
  geom_pointrange(aes(size = imp_scaled,
                      xmin = estimate - 1.96*std.error,
                      xmax = estimate + 1.96*std.error),
                  position = position_dodge(.5)) +
  facet_wrap(~ term_cat, ncol = 2, scales = "free_y") +
  scale_x_continuous(labels = scales::percent,
                     limits = c(-.25, .5)) +
  scale_size_continuous(name = "Importance (Scaled 0-1)",
                        range = c(0.5,2)) +
  scale_color_manual(name = "Time Period",
                     values = c("#440154FF", "#2A788EFF", "#7AD151FF")) +
  labs(x = "OLS Coefficient", y = NULL,
       caption = "Continuous variables (age and feeling thermometers) scaled to have mean 0 and standard deviation 1,\nsuch that effects are interpretable in terms of 1 standard deviation.") +
  geom_vline(xintercept = 0, lty = 2, color = "grey25") +
  theme_prl() +
  theme(legend.spacing.y = unit(-0.25, "cm"))

ggsave(here("Figures","figure_4.pdf"), fig_4,
       dpi = 600, units = "in", height = 6, width = 8)

fig_s7 = res_demo |> 
  mutate(
    term_clean = case_match(
      iv,
      "employed" ~ "Employed",
      "male" ~ "Male",
      "children" ~ "Parent",
      "college" ~ "College-Educated",
      "registered" ~ "Registered Voter",
      "bornagain" ~ "Evangelical",
      "voted_2020" ~ "2020 Voter",
      "married" ~ "Married",
      "white" ~ "White",
      "pid_Republican" ~ "PID: Republican",
      "pid_Independent" ~ "PID: Independent",
      "inc_terc_Middle" ~ "Income (Middle Terc.)",
      "inc_terc_Upper" ~ "Income (Upper Terc.)",
      "strong" ~ "Strong Partisan",
      "age" ~ "Age",
      "trust_binary" ~ "General Trust",
      "vote_import_binary" ~ "Vote Efficacy",
      "democrat_therm_1" ~ "Democrat FT",
      "republican_therm_1" ~ "Republican FT",
      "maga" ~ "MAGA (self-identified)",
      "corruption_binary" ~ "Perceived Corruption",
      "fair_binary" ~ "Fair Political Treatment",
      "pride_binary" ~ "Proud to be American",
      "democracy_binary" ~ "Democracy Importance",
      "response_binary" ~ "Perceived Responsiveness"
    ),
    term_cat = case_when(
      iv %in% c("pid_Republican","pid_Independent","employed","male","children",
                "college","registered","bornagain","voted_2020","married","white",
                "inc_terc_Middle", "inc_terc_Upper","age") ~ "Demographic",
      .default = "Attitudinal"
    )) |>
  filter(term_cat == "Demographic") |> 
  mutate(abs_imp = abs(Importance),
         imp_scaled = (abs_imp - min(abs_imp)) / (max(abs_imp) - min(abs_imp))) |> 
  ggplot(aes(x = estimate, y = reorder(term_clean,estimate), color = period)) +
  geom_pointrange(aes(size = imp_scaled,
                      xmin = estimate - 1.96*std.error,
                      xmax = estimate + 1.96*std.error),
                  position = position_dodge(.5)) +
  scale_x_continuous(labels = scales::percent,
                     limits = c(-.25, .5)) +
  scale_size_continuous(name = "Importance (Scaled 0-1)",
                        range = c(0.5,2)) +
  scale_color_manual(name = "Time Period",
                     values = c("#440154FF", "#2A788EFF", "#7AD151FF")) +
  labs(x = "OLS Coefficient", y = NULL,
       caption = "Continuous variables (age and feeling thermometers) scaled to have mean 0 and standard deviation 1,\nsuch that effects are interpretable in terms of 1 standard deviation.") +
  geom_vline(xintercept = 0, lty = 2, color = "grey25") +
  theme_prl() +
  theme(legend.spacing.y = unit(-0.25, "cm"))

ggsave(here("Figures","figure_s7.pdf"), fig_s7,
       dpi = 600, units = "in", height = 6, width = 7)
